{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading Data into Pandas" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# conventional way to import pandas\n", "import pandas as pd " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read CSV" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# read data from csv file \n", "corona = pd.read_csv(\"../data/covid-19_cleaned_data.csv\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Province/StateCountry/RegionLatLongDateConfirmedDeathsRecovered
0ThailandThailand15.0000101.00002020-01-22200
1JapanJapan36.0000138.00002020-01-22200
2SingaporeSingapore1.2833103.83332020-01-22000
3NepalNepal28.166784.25002020-01-22000
4MalaysiaMalaysia2.5000112.50002020-01-22000
\n", "
" ], "text/plain": [ " Province/State Country/Region Lat Long Date Confirmed \\\n", "0 Thailand Thailand 15.0000 101.0000 2020-01-22 2 \n", "1 Japan Japan 36.0000 138.0000 2020-01-22 2 \n", "2 Singapore Singapore 1.2833 103.8333 2020-01-22 0 \n", "3 Nepal Nepal 28.1667 84.2500 2020-01-22 0 \n", "4 Malaysia Malaysia 2.5000 112.5000 2020-01-22 0 \n", "\n", " Deaths Recovered \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Examine first few rows \n", "corona.head() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Excel Sheet" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# read data from excel file \n", "movies = pd.read_excel(\"../data/movies.xls\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleYearGenresLanguageCountryContent RatingDurationAspect RatioBudgetGross Earnings...Facebook Likes - Actor 1Facebook Likes - Actor 2Facebook Likes - Actor 3Facebook Likes - cast TotalFacebook likes - MovieFacenumber in postersUser VotesReviews by UsersReviews by CrtiicsIMDB Score
0Intolerance: Love's Struggle Throughout the Ages1916Drama|History|WarNaNUSANot Rated1231.33385907.0NaN...436229.04816911107188869.08.0
1Over the Hill to the Poorhouse1920Crime|DramaNaNUSANaN1101.33100000.03000000.0...220.0401511.04.8
2The Big Parade1925Drama|Romance|WarNaNUSANot Rated1511.33245000.0NaN...81126.0108226048494548.08.3
3Metropolis1927Drama|Sci-FiGermanGermanyNot Rated1451.336000000.026435.0...1362318.0203120001111841413260.08.3
4Pandora's Box1929Crime|Drama|RomanceGermanGermanyNot Rated1101.33NaN9950.0...426203.0455926174318471.08.0
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " Title Year \\\n", "0 Intolerance: Love's Struggle Throughout the Ages  1916 \n", "1 Over the Hill to the Poorhouse  1920 \n", "2 The Big Parade  1925 \n", "3 Metropolis  1927 \n", "4 Pandora's Box  1929 \n", "\n", " Genres Language Country Content Rating Duration \\\n", "0 Drama|History|War NaN USA Not Rated 123 \n", "1 Crime|Drama NaN USA NaN 110 \n", "2 Drama|Romance|War NaN USA Not Rated 151 \n", "3 Drama|Sci-Fi German Germany Not Rated 145 \n", "4 Crime|Drama|Romance German Germany Not Rated 110 \n", "\n", " Aspect Ratio Budget Gross Earnings ... Facebook Likes - Actor 1 \\\n", "0 1.33 385907.0 NaN ... 436 \n", "1 1.33 100000.0 3000000.0 ... 2 \n", "2 1.33 245000.0 NaN ... 81 \n", "3 1.33 6000000.0 26435.0 ... 136 \n", "4 1.33 NaN 9950.0 ... 426 \n", "\n", " Facebook Likes - Actor 2 Facebook Likes - Actor 3 \\\n", "0 22 9.0 \n", "1 2 0.0 \n", "2 12 6.0 \n", "3 23 18.0 \n", "4 20 3.0 \n", "\n", " Facebook Likes - cast Total Facebook likes - Movie Facenumber in posters \\\n", "0 481 691 1 \n", "1 4 0 1 \n", "2 108 226 0 \n", "3 203 12000 1 \n", "4 455 926 1 \n", "\n", " User Votes Reviews by Users Reviews by Crtiics IMDB Score \n", "0 10718 88 69.0 8.0 \n", "1 5 1 1.0 4.8 \n", "2 4849 45 48.0 8.3 \n", "3 111841 413 260.0 8.3 \n", "4 7431 84 71.0 8.0 \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine first few rows \n", "movies.head() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Multiple Excel Sheets " ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "import xlrd \n", "# Import xlsx file and store each sheet in to a df list\n", "xl_file = pd.ExcelFile(\"../data/data.xls\",)\n", "# Dictionary comprehension\n", "dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Data from each sheet can be accessed via key\n", "keylist = list(dfs.keys())" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['2020-03-13-03-30',\n", " '2020-03-13-00-00',\n", " '2020-03-12-22-00',\n", " '2020-03-12-21-30',\n", " '2020-03-12-21-00',\n", " '2020-03-12-20-00',\n", " '2020-03-12-18-30',\n", " '2020-03-12-17-00',\n", " '2020-03-12-15-30']" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Examine the sheet name \n", "keylist[1:10]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Province/StateCountry/RegionLast UpdateConfirmedDeathsRecovered
0HubeiMainland China3/13/2020 06:0067786306251553
1GuangdongMainland China3/13/2020 06:00135681296
2ZhejiangMainland China3/13/2020 06:00121511209
3ShandongMainland China3/13/2020 06:007607739
4HenanMainland China3/13/2020 06:001273221249
.....................
216NaNMongolia3/13/2020 06:00100
217NaNSt. Barth3/13/2020 06:00100
218NaNSt. Vincent Grenadines3/13/2020 06:00100
219NaNTogo3/13/2020 06:00100
220NaNTrinidad and Tobago3/13/2020 06:00100
\n", "

221 rows × 6 columns

\n", "
" ], "text/plain": [ " Province/State Country/Region Last Update Confirmed \\\n", "0 Hubei Mainland China 3/13/2020 06:00 67786 \n", "1 Guangdong Mainland China 3/13/2020 06:00 1356 \n", "2 Zhejiang Mainland China 3/13/2020 06:00 1215 \n", "3 Shandong Mainland China 3/13/2020 06:00 760 \n", "4 Henan Mainland China 3/13/2020 06:00 1273 \n", ".. ... ... ... ... \n", "216 NaN Mongolia 3/13/2020 06:00 1 \n", "217 NaN St. Barth 3/13/2020 06:00 1 \n", "218 NaN St. Vincent Grenadines 3/13/2020 06:00 1 \n", "219 NaN Togo 3/13/2020 06:00 1 \n", "220 NaN Trinidad and Tobago 3/13/2020 06:00 1 \n", "\n", " Deaths Recovered \n", "0 3062 51553 \n", "1 8 1296 \n", "2 1 1209 \n", "3 7 739 \n", "4 22 1249 \n", ".. ... ... \n", "216 0 0 \n", "217 0 0 \n", "218 0 0 \n", "219 0 0 \n", "220 0 0 \n", "\n", "[221 rows x 6 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Accessing first sheet\n", "dfs[keylist[0]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## From URL" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# read a dataset of Chipotle orders directly from a URL and store the results in a DataFrame \n", "orders = pd.read_table('http://bit.ly/chiporders')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "0 1 1 Chips and Fresh Tomato Salsa \n", "1 1 1 Izze \n", "2 1 1 Nantucket Nectar \n", "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", "4 2 2 Chicken Bowl \n", "\n", " choice_description item_price \n", "0 NaN $2.39 \n", "1 [Clementine] $3.39 \n", "2 [Apple] $3.39 \n", "3 NaN $2.39 \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the first 5 rows \n", "orders.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
461718331Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...$11.75
461818331Steak Burrito[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...$11.75
461918341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$11.25
462018341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...$8.75
462118341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$8.75
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "4617 1833 1 Steak Burrito \n", "4618 1833 1 Steak Burrito \n", "4619 1834 1 Chicken Salad Bowl \n", "4620 1834 1 Chicken Salad Bowl \n", "4621 1834 1 Chicken Salad Bowl \n", "\n", " choice_description item_price \n", "4617 [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75 \n", "4618 [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75 \n", "4619 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25 \n", "4620 [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75 \n", "4621 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the last 5 rows \n", "orders.tail()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
531Chicken Bowl[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...$10.98
631Side of ChipsNaN$1.69
741Steak Burrito[Tomatillo Red Chili Salsa, [Fajita Vegetables...$11.75
841Steak Soft Tacos[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...$9.25
951Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...$9.25
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "0 1 1 Chips and Fresh Tomato Salsa \n", "1 1 1 Izze \n", "2 1 1 Nantucket Nectar \n", "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", "4 2 2 Chicken Bowl \n", "5 3 1 Chicken Bowl \n", "6 3 1 Side of Chips \n", "7 4 1 Steak Burrito \n", "8 4 1 Steak Soft Tacos \n", "9 5 1 Steak Burrito \n", "\n", " choice_description item_price \n", "0 NaN $2.39 \n", "1 [Clementine] $3.39 \n", "2 [Apple] $3.39 \n", "3 NaN $2.39 \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", "5 [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98 \n", "6 NaN $1.69 \n", "7 [Tomatillo Red Chili Salsa, [Fajita Vegetables... $11.75 \n", "8 [Tomatillo Green Chili Salsa, [Pinto Beans, Ch... $9.25 \n", "9 [Fresh Tomato Salsa, [Rice, Black Beans, Pinto... $9.25 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the first `n` number of rows\n", "orders.head(10)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
461218311Carnitas Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...$9.25
461318311ChipsNaN$2.15
461418311Bottled WaterNaN$1.50
461518321Chicken Soft Tacos[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream]]$8.75
461618321Chips and GuacamoleNaN$4.45
461718331Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...$11.75
461818331Steak Burrito[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...$11.75
461918341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$11.25
462018341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...$8.75
462118341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$8.75
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "4612 1831 1 Carnitas Bowl \n", "4613 1831 1 Chips \n", "4614 1831 1 Bottled Water \n", "4615 1832 1 Chicken Soft Tacos \n", "4616 1832 1 Chips and Guacamole \n", "4617 1833 1 Steak Burrito \n", "4618 1833 1 Steak Burrito \n", "4619 1834 1 Chicken Salad Bowl \n", "4620 1834 1 Chicken Salad Bowl \n", "4621 1834 1 Chicken Salad Bowl \n", "\n", " choice_description item_price \n", "4612 [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... $9.25 \n", "4613 NaN $2.15 \n", "4614 NaN $1.50 \n", "4615 [Fresh Tomato Salsa, [Rice, Cheese, Sour Cream]] $8.75 \n", "4616 NaN $4.45 \n", "4617 [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75 \n", "4618 [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75 \n", "4619 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25 \n", "4620 [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75 \n", "4621 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the last `n` number of rows\n", "orders.tail(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modify Dataset" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# read a dataset of movie reviewers (modifying the default parameter values for read_table)\n", "users = pd.read_table('http://bit.ly//movieusers')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1|24|M|technician|85711
02|53|F|other|94043
13|23|M|writer|32067
24|24|M|technician|43537
35|33|F|other|15213
46|42|M|executive|98101
\n", "
" ], "text/plain": [ " 1|24|M|technician|85711\n", "0 2|53|F|other|94043\n", "1 3|23|M|writer|32067\n", "2 4|24|M|technician|43537\n", "3 5|33|F|other|15213\n", "4 6|42|M|executive|98101" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the first 5 rows \n", "users.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# DataFrame looks ugly. let's modify the default parameter for read_table \n", "user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n", "users = pd.read_table('http://bit.ly//movieusers', sep='|' , header=None, names=user_cols)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idagegenderoccupationzip_code
0124Mtechnician85711
1253Fother94043
2323Mwriter32067
3424Mtechnician43537
4533Fother15213
\n", "
" ], "text/plain": [ " user_id age gender occupation zip_code\n", "0 1 24 M technician 85711\n", "1 2 53 F other 94043\n", "2 3 23 M writer 32067\n", "3 4 24 M technician 43537\n", "4 5 33 F other 15213" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now take a look at modified dataset\n", "users.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Biological Data(.txt)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# read text/csv data into pandas \n", "chrom = pd.read_csv(\"../data/Encode_HMM_data.txt\", delimiter=\"\\t\", header=None)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
0chr1100001060015_Repetitive/CNV0.1000010600245,245,245
1chr1106001113713_Heterochrom/lo0.1060011137245,245,245
2chr111137117378_Insulator0.111371173710,190,254
3chr1117371193711_Weak_Txn0.1173711937153,255,102
4chr111937121377_Weak_Enhancer0.1193712137255,252,4
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8\n", "0 chr1 10000 10600 15_Repetitive/CNV 0 . 10000 10600 245,245,245\n", "1 chr1 10600 11137 13_Heterochrom/lo 0 . 10600 11137 245,245,245\n", "2 chr1 11137 11737 8_Insulator 0 . 11137 11737 10,190,254\n", "3 chr1 11737 11937 11_Weak_Txn 0 . 11737 11937 153,255,102\n", "4 chr1 11937 12137 7_Weak_Enhancer 0 . 11937 12137 255,252,4" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Examine first few rows \n", "chrom.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# it's not much better to see. so we have to modify this dataset\n", "cols_name = ['chrom', 'start', 'stop', 'type']\n", "chrom = pd.read_csv(\"../data/Encode_HMM_data.txt\", delimiter=\"\\t\", header=None, names=cols_name)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromstartstoptype
chr1100001060015_Repetitive/CNV0.1000010600245,245,245
106001113713_Heterochrom/lo0.1060011137245,245,245
11137117378_Insulator0.111371173710,190,254
117371193711_Weak_Txn0.1173711937153,255,102
11937121377_Weak_Enhancer0.1193712137255,252,4
\n", "
" ], "text/plain": [ " chrom start stop type\n", "chr1 10000 10600 15_Repetitive/CNV 0 . 10000 10600 245,245,245\n", " 10600 11137 13_Heterochrom/lo 0 . 10600 11137 245,245,245\n", " 11137 11737 8_Insulator 0 . 11137 11737 10,190,254\n", " 11737 11937 11_Weak_Txn 0 . 11737 11937 153,255,102\n", " 11937 12137 7_Weak_Enhancer 0 . 11937 12137 255,252,4" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now examine first few rows \n", "chrom.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read Biological Data(.tsv)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "pokemon = pd.read_csv(\"../data/pokemon.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NumberNameType_1Type_2TotalHPAttackDefenseSp_AtkSp_Def...ColorhasGenderPr_MaleEgg_Group_1Egg_Group_2hasMegaEvolutionHeight_mWeight_kgCatch_RateBody_Style
01BulbasaurGrassPoison3184549496565...GreenTrue0.875MonsterGrassFalse0.716.945quadruped
12IvysaurGrassPoison4056062638080...GreenTrue0.875MonsterGrassFalse0.9913.045quadruped
23VenusaurGrassPoison525808283100100...GreenTrue0.875MonsterGrassTrue2.01100.045quadruped
34CharmanderFireNaN3093952436050...RedTrue0.875MonsterDragonFalse0.618.545bipedal_tailed
45CharmeleonFireNaN4055864588065...RedTrue0.875MonsterDragonFalse1.0919.045bipedal_tailed
\n", "

5 rows × 23 columns

\n", "
" ], "text/plain": [ " Number Name Type_1 Type_2 Total HP Attack Defense Sp_Atk \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 65 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 80 \n", "2 3 Venusaur Grass Poison 525 80 82 83 100 \n", "3 4 Charmander Fire NaN 309 39 52 43 60 \n", "4 5 Charmeleon Fire NaN 405 58 64 58 80 \n", "\n", " Sp_Def ... Color hasGender Pr_Male Egg_Group_1 Egg_Group_2 \\\n", "0 65 ... Green True 0.875 Monster Grass \n", "1 80 ... Green True 0.875 Monster Grass \n", "2 100 ... Green True 0.875 Monster Grass \n", "3 50 ... Red True 0.875 Monster Dragon \n", "4 65 ... Red True 0.875 Monster Dragon \n", "\n", " hasMegaEvolution Height_m Weight_kg Catch_Rate Body_Style \n", "0 False 0.71 6.9 45 quadruped \n", "1 False 0.99 13.0 45 quadruped \n", "2 True 2.01 100.0 45 quadruped \n", "3 False 0.61 8.5 45 bipedal_tailed \n", "4 False 1.09 19.0 45 bipedal_tailed \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pokemon.head() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read HTML Data" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# Read HTML data from web \n", "url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'\n", "data = pd.io.html.read_html(url)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check type \n", "type(data)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Bank NameCitySTCERTAcquiring InstitutionClosing Date
0The First State BankBarboursvilleWV14361MVB Bank, Inc.April 3, 2020
1Ericson State BankEricsonNE18265Farmers and Merchants BankFebruary 14, 2020
2City National Bank of New JerseyNewarkNJ21111Industrial BankNovember 1, 2019
3Resolute BankMaumeeOH58317Buckeye State BankOctober 25, 2019
4Louisa Community BankLouisaKY58112Kentucky Farmers Bank CorporationOctober 25, 2019
.....................
556Superior Bank, FSBHinsdaleIL32646Superior Federal, FSBJuly 27, 2001
557Malta National BankMaltaOH6629North Valley BankMay 3, 2001
558First Alliance Bank & Trust Co.ManchesterNH34264Southern New Hampshire Bank & TrustFebruary 2, 2001
559National State Bank of MetropolisMetropolisIL3815Banterra Bank of MarionDecember 14, 2000
560Bank of HonoluluHonoluluHI21029Bank of the OrientOctober 13, 2000
\n", "

561 rows × 6 columns

\n", "
" ], "text/plain": [ " Bank Name City ST CERT \\\n", "0 The First State Bank Barboursville WV 14361 \n", "1 Ericson State Bank Ericson NE 18265 \n", "2 City National Bank of New Jersey Newark NJ 21111 \n", "3 Resolute Bank Maumee OH 58317 \n", "4 Louisa Community Bank Louisa KY 58112 \n", ".. ... ... .. ... \n", "556 Superior Bank, FSB Hinsdale IL 32646 \n", "557 Malta National Bank Malta OH 6629 \n", "558 First Alliance Bank & Trust Co. Manchester NH 34264 \n", "559 National State Bank of Metropolis Metropolis IL 3815 \n", "560 Bank of Honolulu Honolulu HI 21029 \n", "\n", " Acquiring Institution Closing Date \n", "0 MVB Bank, Inc. April 3, 2020 \n", "1 Farmers and Merchants Bank February 14, 2020 \n", "2 Industrial Bank November 1, 2019 \n", "3 Buckeye State Bank October 25, 2019 \n", "4 Kentucky Farmers Bank Corporation October 25, 2019 \n", ".. ... ... \n", "556 Superior Federal, FSB July 27, 2001 \n", "557 North Valley Bank May 3, 2001 \n", "558 Southern New Hampshire Bank & Trust February 2, 2001 \n", "559 Banterra Bank of Marion December 14, 2000 \n", "560 Bank of the Orient October 13, 2000 \n", "\n", "[561 rows x 6 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# access data \n", "data[0]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }